In [1]:
# Load Biospytial modules and configure the environment
%matplotlib inline
import sys
sys.path.append('/apps')
import django
django.setup()
import pandas as pd
import matplotlib.pyplot as plt
## Use the ggplot style
plt.style.use('ggplot')
import numpy as np
Taken from the PyMC3 getting-started guide (http://docs.pymc.io/notebooks/getting_started.html). We are interested in predicting outcomes Y as normally-distributed observations with an expected value μ that is a linear function of two predictor variables, X1 and X2.
$$ Y \sim N(\mu, \sigma^2) $$
$$ \mu = \alpha + \beta_1 X_1 + \beta_2 X_2 $$
where α is the intercept and β_i is the coefficient for covariate X_i, while σ represents the observation error. Since we are constructing a Bayesian model, the unknown variables in the model must be assigned a prior distribution. We choose zero-mean normal priors with a variance of 100 for both regression coefficients, which corresponds to weak information regarding the true parameter values. We choose a half-normal distribution (a normal distribution bounded at zero) as the prior for σ.
$$ \alpha \sim N(0, 100) $$
$$ \beta_i \sim N(0, 100) $$
$$ \sigma \sim \left|N(0, 1)\right| $$
In [2]:
import numpy as np
import matplotlib.pyplot as plt
# Initialize random number generator
np.random.seed(123)
# True parameter values
alpha, sigma = 1, 1
beta = [1, 2.5]
# Size of dataset
size = 100
# Predictor variables
X1 = np.random.randn(size)
X2 = np.random.randn(size) * 0.2
# Simulate outcome variable
Y = alpha + beta[0]*X1 + beta[1]*X2 + np.random.randn(size)*sigma
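As a quick sanity check (an addition, not part of the original notebook), an ordinary least-squares fit on the simulated data should recover estimates close to the true values α = 1, β = (1, 2.5):
# Sanity check: OLS on the simulated data should land near the true parameters
X = np.column_stack([np.ones(size), X1, X2])
coef, *_ = np.linalg.lstsq(X, Y, rcond=None)
print(coef)  # roughly [1, 1, 2.5]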
In [3]:
fig, axes = plt.subplots(1, 2, sharex=True, figsize=(10,4))
axes[0].scatter(X1, Y)
axes[1].scatter(X2, Y)
axes[0].set_ylabel('Y'); axes[0].set_xlabel('X1'); axes[1].set_xlabel('X2');
In [4]:
# Optional workaround for a Theano/MKL threading error, if one occurs:
#import os
#os.environ['MKL_THREADING_LAYER'] = 'GNU'
In [5]:
## Model Specification
import pymc3 as pm
In [6]:
basic_model = pm.Model()
with basic_model:
    # Priors for unknown model parameters
    alpha = pm.Normal('alpha', mu=0, sd=10)
    # shape=2 makes beta a vector of two coefficients
    beta = pm.Normal('beta', mu=0, sd=10, shape=2)
    sigma = pm.HalfNormal('sigma', sd=1)
    # Expected value of outcome
    mu = alpha + beta[0]*X1 + beta[1]*X2
    # Likelihood (sampling distribution) of observations
    Y_obs = pm.Normal('Y_obs', mu=mu, sd=sigma, observed=Y)
In [7]:
%time map_estimate = pm.find_MAP(model=basic_model)
In [8]:
map_estimate
Out[8]:
In [9]:
from scipy import optimize
map_estimate = pm.find_MAP(model=basic_model, fmin=optimize.fmin_powell)
map_estimate
Out[9]:
In [10]:
with basic_model:
    # draw posterior samples (NUTS is assigned automatically)
    trace = pm.sample()
In [11]:
trace['alpha'][-5:]
Out[11]:
If we wanted to use the slice sampling algorithm to sample sigma instead of NUTS (which was assigned automatically), we could have specified this as the step argument for sample.
In [12]:
with basic_model:
    # obtain starting values via MAP
    start = pm.find_MAP(fmin=optimize.fmin_powell)
    # instantiate sampler
    step = pm.Slice()
    # draw 5000 posterior samples
    trace = pm.sample(5000, step=step, start=start)
In [13]:
_ = pm.traceplot(trace)
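Besides the trace plot, PyMC3 can report the posterior numerically; a minimal usage sketch:
# Tabular posterior summary (mean, sd, and credible intervals per variable)
pm.summary(trace)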
We present a case study of stochastic volatility, i.e. time-varying stock market volatility, to illustrate PyMC3's use on a more realistic problem. The distribution of market returns is highly non-normal, which makes sampling the volatilities significantly more difficult. This example has 400+ parameters, so common sampling algorithms like Metropolis-Hastings would get bogged down, generating highly autocorrelated samples. Instead, we use NUTS, which is dramatically more efficient.
The Model
Asset prices have time-varying volatility (variance of day-over-day returns). In some periods, returns are highly variable, while in others they are very stable. Stochastic volatility models address this with a latent volatility variable, which changes over time. The following model is similar to the one described in the NUTS paper (Hoffman and Gelman 2014, p. 21).
$$ \sigma \sim \mathrm{Exponential}(50) $$
$$ \nu \sim \mathrm{Exponential}(0.1) $$
$$ s_i \sim N(s_{i-1}, \sigma^{-2}) $$
$$ \log(y_i) \sim t(\nu, 0, \exp(-2 s_i)) $$
Here, y is the daily return series, which is modeled with a Student-t distribution with an unknown degrees-of-freedom parameter ν and a scale parameter determined by a latent process s. The individual s_i are the daily log volatilities in the latent log-volatility process.
In [15]:
import pandas_datareader.data as web
import pandas as pd
In [41]:
import datetime
# Original date range
start = datetime.datetime(2008, 5, 1)
end = datetime.datetime(2009, 12, 1)
# My parameters
start = datetime.datetime(2018, 1, 1)
end = datetime.datetime(2018, 2, 12)
returns = web.DataReader('SPY', 'google', start, end)['Close'].pct_change()
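Note that the 'google' source has since been deprecated in pandas-datareader, so the download above may fail on newer versions. A minimal fallback sketch, assuming a hypothetical local file sp500.csv with Date and Close columns:
# Hypothetical fallback: load previously saved prices from disk instead
prices = pd.read_csv('sp500.csv', index_col='Date', parse_dates=True)
returns = prices['Close'].pct_change()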
In [42]:
returns.plot(figsize=(10, 6))
plt.ylabel('daily returns in %');
In [43]:
with pm.Model() as sp500_model:
    # Priors: degrees of freedom and step size of the volatility walk
    nu = pm.Exponential('nu', 1./10, testval=5.)
    sigma = pm.Exponential('sigma', 1./.02, testval=.1)
    # Latent log volatility follows a Gaussian random walk (precision sigma**-2)
    s = pm.GaussianRandomWalk('s', sigma**-2, shape=len(returns))
    volatility_process = pm.Deterministic('volatility_process', pm.math.exp(-2*s))
    # Student-t likelihood for the daily returns
    r = pm.StudentT('r', nu, lam=1/volatility_process, observed=returns)
In [44]:
with sp500_model:
    trace = pm.sample(2000)
In [45]:
pm.traceplot(trace, [nu, sigma]);
In [46]:
fig, ax = plt.subplots(figsize=(15, 8))
returns.plot(ax=ax)
ax.plot(returns.index, 1/np.exp(trace['s',::5].T), 'r', alpha=.03);
ax.set(title='volatility_process', xlabel='time', ylabel='volatility');
ax.legend(['S&P500', 'stochastic volatility process'])
Out[46]:
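To condense the overlaid posterior draws into a single curve, one can also plot the posterior median of the volatility path (a small addition, on the same exp(-s) scale used in the plot above):
# Posterior median of the latent volatility path
vol_median = np.median(1 / np.exp(trace['s']), axis=0)
fig, ax = plt.subplots(figsize=(15, 4))
ax.plot(returns.index, vol_median, 'k')
ax.set(xlabel='time', ylabel='median volatility');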
Consider the following time series of recorded coal mining disasters in the UK from 1851 to 1962 (Jarrett, 1979). The number of disasters is thought to have been affected by changes in safety regulations during this period. Unfortunately, we also have a pair of years with missing data, identified as missing in a NumPy MaskedArray using -999 as the marker value.
Next we will build a model for this series and attempt to estimate when the change occurred. At the same time, we will see how to handle missing data, use multiple samplers and sample from discrete random variables.
In [47]:
disaster_data = np.ma.masked_values([4, 5, 4, 0, 1, 4, 3, 4, 0, 6, 3, 3, 4, 0, 2, 6,
3, 3, 5, 4, 5, 3, 1, 4, 4, 1, 5, 5, 3, 4, 2, 5,
2, 2, 3, 4, 2, 1, 3, -999, 2, 1, 1, 1, 1, 3, 0, 0,
1, 0, 1, 1, 0, 0, 3, 1, 0, 3, 2, 2, 0, 1, 1, 1,
0, 1, 0, 1, 0, 0, 0, 2, 1, 0, 0, 0, 1, 1, 0, 2,
3, 3, 1, -999, 2, 1, 1, 1, 1, 2, 4, 2, 0, 0, 1, 4,
0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1], value=-999)
year = np.arange(1851, 1962)
plt.plot(year, disaster_data, 'o', markersize=8);
plt.ylabel("Disaster count")
plt.xlabel("Year")
Out[47]:
The model parameters are defined as follows:
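A compact specification, matching the code below (with s the switchpoint year, e and l the early and late rates, and D_t the disaster count in year t):
$$ D_t \sim \mathrm{Poisson}(r_t) $$
$$ r_t = \begin{cases} e, & \text{if } t \le s \\ l, & \text{if } t > s \end{cases} $$
$$ s \sim \mathrm{DiscreteUniform}(1851, 1961) $$
$$ e \sim \mathrm{Exponential}(1) $$
$$ l \sim \mathrm{Exponential}(1) $$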
In [54]:
with pm.Model() as disaster_model:
    switchpoint = pm.DiscreteUniform('switchpoint', lower=year.min(), upper=year.max(), testval=1900)
    # Priors for the pre- and post-switch disaster rates
    early_rate = pm.Exponential('early_rate', 1)
    late_rate = pm.Exponential('late_rate', 1)
    # Allocate the appropriate Poisson rate to years before and after the switchpoint
    rate = pm.math.switch(switchpoint >= year, early_rate, late_rate)
    disasters = pm.Poisson('disasters', rate, observed=disaster_data)
In [55]:
with disaster_model:
    trace = pm.sample(10000)
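Because disaster_data is a masked array, PyMC3 automatically creates a disasters_missing random variable and imputes the two masked years during sampling; the imputed counts can be inspected directly from the trace:
# Posterior samples of the two imputed disaster counts
trace['disasters_missing'][-5:]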
In [56]:
pm.traceplot(trace)
Out[56]: